

# This line is required to display the plots in the notebook
%matplotlib inline
# Sampling rate of the data-acquisition hardware, in Hz; used to build the
# time axis for every recorded trace.
D_SAMPLING_FREQUENCY = 250.0
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Use ggplot styling for every matplotlib figure in this notebook.
plt.style.use('ggplot')
def getChannelData(iChannel, strTestCase):
    """Load one channel's raw trace for a test case.

    Reads every big-endian float32 file whose name contains
    'Trace0<iChannel>' in ./TrainingData/<strTestCase> and concatenates
    them into a single array.

    Parameters
    ----------
    iChannel : int
        Channel number (1-4); selects files matching 'Trace0<iChannel>'.
    strTestCase : str
        Sub-folder of TrainingData holding the recordings.

    Returns
    -------
    (numpy.ndarray, list of str)
        The concatenated samples in native byte order, and the names of
        the files that were read.
    """
    # os.path.join keeps this portable; the original hard-coded '\\'.
    strDir = os.path.join(os.getcwd(), 'TrainingData', strTestCase)
    f = []
    # Scan only the top level of the folder (break after the first step).
    for (dirpath, dirnames, filenames) in os.walk(strDir):
        f.extend(filenames)
        break
    strFileSearch = 'Trace0' + str(iChannel)
    # BUG FIX: filter() returns an iterator on Python 3, so len()/indexing
    # failed; materialize a list. sorted() makes the concatenation order
    # deterministic (os.walk order is filesystem-dependent).
    strFiles = sorted(strName for strName in f if strFileSearch in strName)
    xEnd = np.array([], dtype='>f')
    for strFile in strFiles:
        # BUG FIX: the original appended to an undefined name `x` for every
        # file after the first; accumulate into xEnd instead.
        with open(os.path.join(strDir, strFile), 'rb') as fh:
            # The instrument writes big-endian float32 samples.
            xEnd = np.append(xEnd, np.fromfile(fh, dtype=('>f')))
    # Convert to the native byte order; pandas rejects big-endian buffers on
    # little-endian machines. astype() replaces byteswap().newbyteorder(),
    # which was removed from the ndarray API in NumPy 2.0.
    # See: http://pandas.pydata.org/pandas-docs/stable/gotchas.html
    x = xEnd.astype(np.float32)
    return (x, strFiles)
def padDataFrame(dfData, idx):
    """Overwrite the first and last `idx` rows of every column with the
    nearest interior value, in place.

    Rolling statistics computed with center=True leave NaNs in the first
    and last half-window of rows; this fills them with the closest
    computed value.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Frame to pad; mutated in place.
    idx : int
        Number of rows to pad at each end.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, for call chaining.
    """
    if idx <= 0:
        # Nothing to pad; also avoids iloc[-0:] selecting the whole column.
        return dfData
    for iCol in range(len(dfData.columns)):
        # Positional assignment directly on the frame avoids pandas'
        # chained-assignment pitfall (SettingWithCopyWarning / silent no-op
        # on copy-on-write pandas) and the deprecated float(Series) cast.
        dfData.iloc[0:idx, iCol] = dfData.iloc[idx, iCol]
        dfData.iloc[-idx:, iCol] = dfData.iloc[-idx - 1, iCol]
    return dfData
def getDataFrameRM(dfData, window_size, bRename=True):
    """Centered rolling mean of every column.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Input signal columns.
    window_size : int
        Width of the rolling window, in samples.
    bRename : bool
        When True, suffix the output column names with '_rm'.

    Returns
    -------
    pandas.DataFrame
        Rolling mean with the NaN edge rows padded by the nearest value.
    """
    dfDataRM = dfData.rolling(window=window_size, center=True).mean()
    if bRename:
        dfDataRM.columns = dfDataRM.columns + '_rm'
    # Half a window is undefined at each end (zero-indexed, so no +1);
    # replace those rows with the closest computed value.
    iPad = int(window_size / 2)
    return padDataFrame(dfDataRM, iPad)
def getDataFrameKR(dfData, window_size):
    """Centered rolling kurtosis of every column, suffixed '_kr'.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Input signal columns.
    window_size : int
        Width of the rolling window, in samples.

    Returns
    -------
    pandas.DataFrame
        Rolling kurtosis with the NaN edge rows padded by the nearest value.
    """
    dfOut = dfData.rolling(window=window_size, center=True).kurt()
    dfOut.columns = dfOut.columns + '_kr'
    # Half a window is undefined at each end (zero-indexed, so no +1).
    iPad = int(window_size / 2)
    return padDataFrame(dfOut, iPad)
def getRMS(data, window_size):
    """Rolling RMS of a 1-D signal.

    The mean square is computed by convolving the squared signal with a
    uniform moving-average kernel ('same' length as the input), then the
    square root is taken.

    Parameters
    ----------
    data : numpy.ndarray
        1-D input signal.
    window_size : int
        Width of the averaging window, in samples.

    Returns
    -------
    numpy.ndarray
        Rolling RMS, same length as `data`.
    """
    kernel = np.full(window_size, 1.0 / window_size)
    meanSquare = np.convolve(np.square(data), kernel, 'same')
    return np.sqrt(meanSquare)
def getDataFrameRMS(dfData, window_size):
    """Rolling RMS of every column, suffixed '_rms'.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Input signal columns.
    window_size : int
        Width of the rolling window, in samples.

    Returns
    -------
    pandas.DataFrame
        Rolling RMS with the edge rows padded by the nearest value.
    """
    dfRMS = dfData.copy(deep=True)
    for column in dfData:
        data = np.array(dfData[column])
        dfRMS[column] = getRMS(data, window_size)
    dfRMS.columns = dfData.columns + '_rms'
    # Half a window is distorted at each end (zero-indexed, so no +1).
    idx = int(window_size / 2)
    # BUG FIX: previously padded `window_size` rows instead of `idx`
    # (which was computed but unused), clobbering valid samples at both
    # ends; pad only the half-window edge region like the other features.
    dfRMS = padDataFrame(dfRMS, idx)
    return dfRMS
def getDataFramePk(dfData, window_size, bRollingMeanOffset = True):
    """Centered rolling peak of every column, suffixed '_pk'.

    Each column is rectified about its mean — a rolling mean over the same
    window when bRollingMeanOffset is True, otherwise the global column
    mean — so negative excursions contribute to the peak as well. The
    rolling maximum of the rectified signal is then taken.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Input signal columns.
    window_size : int
        Width of the rolling window, in samples.
    bRollingMeanOffset : bool
        Rectify about a rolling mean (True) or the column mean (False).

    Returns
    -------
    pandas.DataFrame
        Rolling peak with the NaN edge rows padded by the nearest value.
    """
    if bRollingMeanOffset:
        mn = getDataFrameRM(dfData, window_size, bRename=False)
    else:
        mn = dfData.mean()
    # Rectify about the mean, then restore the offset.
    dfPk = (dfData - mn).abs() + mn
    # Centered rolling maximum of the rectified signal.
    dfPk = dfPk.rolling(window=window_size, center=True).max()
    dfPk.columns = dfPk.columns + '_pk'
    # Half a window is undefined at each end (zero-indexed, so no +1).
    iPad = int(window_size / 2)
    return padDataFrame(dfPk, iPad)
def getDataFrameCrest(dfData, dfDataPk, dfDataRMS):
    """Crest factor (peak / RMS) per channel, suffixed '_cr'.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Source frame; only its column names are used for the output labels.
    dfDataPk : pandas.DataFrame
        Rolling peak values, columns in the same order as dfDataRMS.
    dfDataRMS : pandas.DataFrame
        Rolling RMS values.

    Returns
    -------
    pandas.DataFrame
        Element-wise peak/RMS ratio per column.
    """
    dfCrest = dfDataPk.copy(deep=True)
    # BUG FIX: .ix was removed from pandas; use positional .iloc instead.
    for idxCol in range(len(dfDataPk.columns)):
        dataPk = np.array(dfDataPk.iloc[:, idxCol])
        dataRMS = np.array(dfDataRMS.iloc[:, idxCol])
        dfCrest.iloc[:, idxCol] = np.divide(dataPk, dataRMS)
    dfCrest.columns = dfData.columns + '_cr'
    return dfCrest
def getDataAsFrame(strTestCase, strClass, bScaleData=True):
    """Load one test-case folder and build its full feature DataFrame.

    Reads all four channels from TrainingData/<strTestCase>, then derives
    rolling-mean, high-frequency, RMS, peak, and velocity features.

    Parameters
    ----------
    strTestCase : str
        Sub-folder of TrainingData to read; also stored in the 'File' column.
    strClass : str
        Surface label stored in every row's 'Surface' column.
    bScaleData : bool
        Unused; kept for backward compatibility of the interface.

    Returns
    -------
    pandas.DataFrame
        Raw channels plus all derived feature columns.
    """
    # BUG FIX: previously read the global `strFolder` instead of the
    # `strTestCase` argument, so the parameter was silently ignored.
    (x1, strFiles1) = getChannelData(1, strTestCase)
    (x2, strFiles2) = getChannelData(2, strTestCase)
    (x3, strFiles3) = getChannelData(3, strTestCase)
    (x4, strFiles4) = getChannelData(4, strTestCase)
    # Time axis in seconds.
    t = np.arange(len(x1)) / D_SAMPLING_FREQUENCY
    # Construct the data frame; scalar 'Surface'/'File' broadcast to all rows.
    dfData = pd.DataFrame(data={'t': t,
                                'Ch1': x1,
                                'Ch2': x2,
                                'Ch3': x3,
                                'Ch4': x4,
                                'Surface': strClass,
                                'File': strTestCase})
    data_cols = [col for col in dfData.columns if 'Ch' in col]
    # Rolling average
    window_size_rm = 50
    dfDataRM = getDataFrameRM(dfData[data_cols], window_size_rm, bRename=False)
    # Rolling-mean residual (high-pass filtered, high-frequency signal).
    dfDataHF = dfData[data_cols] - dfDataRM
    dfDataHF.columns = dfDataHF.columns + '_hf'
    # Peak of the high-frequency signal.
    window_size_hp_pk = 5
    dfDataHFPk = getDataFramePk(dfDataHF, window_size_hp_pk)
    # "Velocity": cumulative sum of the demeaned rolling mean.
    dfDataVL = (dfDataRM - dfDataRM.mean()).cumsum()
    dfDataVL.columns = dfDataVL.columns + "_vl"
    # Now that we are through subtracting, rename the rolling-mean columns.
    dfDataRM.columns = dfDataRM.columns + '_rm'
    # Rolling RMS
    window_size = 11
    dfDataRMS = getDataFrameRMS(dfData[data_cols], window_size)
    # Rolling peak value
    window_size = 25
    dfDataPk = getDataFramePk(dfData[data_cols], window_size)
    # Peak value of the rolling mean over a much wider window.
    window_size_rm_pk = window_size_rm * 5
    dfDataRM_Pk = getDataFramePk(dfDataRM, window_size_rm_pk, bRollingMeanOffset=True)
    # Aggregate the frames. All of them are derived from dfData and share
    # its index, so a plain column-wise concat reproduces the effect of the
    # removed join_axes=[dfData.index] option.
    dfData = pd.concat([dfData, dfDataRM, dfDataRMS, dfDataHF, dfDataHFPk,
                        dfDataPk, dfDataRM_Pk, dfDataVL], axis=1)
    return dfData
def appendDataAsFrame(strTestCase, dfData, strClass):
    """Load another test-case folder and append it to an existing frame.

    Parameters
    ----------
    strTestCase : str
        Sub-folder of TrainingData to read.
    dfData : pandas.DataFrame
        Existing combined frame; not modified.
    strClass : str
        Surface label for the new rows.

    Returns
    -------
    pandas.DataFrame
        New frame with the loaded rows appended and a fresh RangeIndex.
    """
    dfNew = getDataAsFrame(strTestCase, strClass)
    # BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True is the supported equivalent of append +
    # reset_index(drop=True).
    dfDataOut = pd.concat([dfData, dfNew], ignore_index=True)
    return dfDataOut
def plotFeatures(dfDataPlot, strColName):
    """Plot the derived features of one channel on a 3x2 grid.

    Each panel overlays a base signal with one of its rolling features:
    mean, RMS, peak, peak-of-rolling-mean, high-frequency peak, velocity.

    Parameters
    ----------
    dfDataPlot : pandas.DataFrame
        Frame containing 't', the channel, and its derived columns.
    strColName : str
        Base channel column name, e.g. 'Ch1'.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2)
    fig.subplots_adjust(wspace=.5, hspace=0.5)
    # (row, col, y-columns, title suffix) for each panel.
    lstPanels = [
        (0, 0, [strColName, strColName + '_rm'], " Rolling Mean"),
        (0, 1, [strColName, strColName + '_rms'], " RMS"),
        (1, 0, [strColName, strColName + '_pk'], " Peak"),
        (1, 1, [strColName + "_rm", strColName + '_rm_pk'], " Peak of Rolling Mean"),
        (2, 0, [strColName + "_hf", strColName + "_hf_pk"], " High-frequency Peak"),
        (2, 1, [strColName + "_vl"], " Velocity"),
    ]
    for iRow, iCol, lstY, strTitle in lstPanels:
        ax = dfDataPlot.plot(x='t', y=lstY,
                             ax=axes[iRow, iCol], legend=True, figsize=(10, 10))
        ax.set_xlabel('Time, seconds')
        ax.set_ylabel('Amplitude, ADC counts')
        ax.set_title(strColName + strTitle)
def plotFolder(dfDataPlot):
    """Plot raw signal, rolling mean, and rolling peak for all four
    channels of one folder on a 2x2 grid.

    Parameters
    ----------
    dfDataPlot : pandas.DataFrame
        Frame containing 't', 'Ch1'..'Ch4', and their '_rm'/'_pk' columns.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2)
    fig.subplots_adjust(wspace=.5, hspace=0.5)
    # One panel per channel, laid out row-major.
    lstChannels = [('Ch1', 0, 0), ('Ch2', 0, 1), ('Ch3', 1, 0), ('Ch4', 1, 1)]
    for strColName, iRow, iCol in lstChannels:
        ax = dfDataPlot.plot(x='t',
                             y=[strColName, strColName + '_rm', strColName + '_pk'],
                             ax=axes[iRow, iCol], legend=True, figsize=(10, 10))
        ax.set_xlabel('Time, seconds')
        ax.set_ylabel('Amplitude, ADC counts')
        ax.set_title(strColName)
def plotClasses(dfData, strSuff):
    """Overlay per-surface histograms of one feature family for Ch1-Ch3.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Combined frame with a 'Surface' label column.
    strSuff : str
        Feature suffix including the trailing underscore (e.g. 'rm_', or
        '' for the raw channels); columns are looked up as
        'Ch<n>_' + strSuff + <class>.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2)
    fig.subplots_adjust(wspace=0.5, hspace=0.5)
    strClass = ['Cobble', 'Tile', 'Carpet']
    # One re-indexed frame per surface class, columns tagged with the class.
    lstFrames = []
    for strSurface in strClass:
        dfSub = dfData.loc[dfData['Surface'] == strSurface].copy()
        dfSub.columns = dfSub.columns + '_' + strSurface
        lstFrames.append(dfSub.reset_index(drop=True))
    # BUG FIX: join_axes was removed from pd.concat; reindexing onto the
    # first class's index reproduces join_axes=[dfData1.index].
    dfDataPlot = pd.concat(lstFrames, axis=1).reindex(lstFrames[0].index)
    strSeries = ['Ch1_' + strSuff + s for s in strClass]
    ax1 = dfDataPlot[strSeries].plot.hist(figsize=(12, 12), ax=axes[0, 0], alpha=0.4, bins=50)
    strSeries = ['Ch2_' + strSuff + s for s in strClass]
    ax1 = dfDataPlot[strSeries].plot.hist(figsize=(12, 12), ax=axes[0, 1], alpha=0.4, bins=50)
    strSeries = ['Ch3_' + strSuff + s for s in strClass]
    ax1 = dfDataPlot[strSeries].plot.hist(figsize=(12, 12), ax=axes[1, 0], alpha=0.4, bins=50)
    # Ch4 histogram intentionally disabled; axes[1, 1] is left blank.
    #strSeries = ['Ch4_' + strSuff + s for s in strClass]
    #ax1 = dfDataPlot[strSeries].plot.hist(figsize=(12,12), ax=axes[1, 1], alpha = 0.4, bins=50)
def plotCorrChannel(strChannel):
    """Display the correlation matrix of every column whose name contains
    `strChannel`.

    NOTE(review): reads the module-level `dfData` global rather than taking
    the frame as a parameter (unlike plotCorrAccel) — confirm this is
    intentional before reusing elsewhere.
    """
    # Section the data and compute the correlation matrix.
    strCols = [col for col in dfData.columns if strChannel in col]
    dfPlot = dfData[strCols]
    correlations = dfPlot.corr()
    names = list(dfPlot)
    iCols = len(dfPlot.columns)
    # Render the matrix with a fixed [-1, 1] color scale so plots are
    # comparable across channels.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    cax = ax.matshow(correlations, vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(iCols)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()
def plotCorrAccel(dfData):
    """Display the correlation matrix of the three accelerometer channels.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Frame containing 'Ch1', 'Ch2', and 'Ch3' columns.
    """
    # Section the data and compute the correlation matrix.
    dfPlot = dfData[['Ch1', 'Ch2', 'Ch3']]
    correlations = dfPlot.corr()
    names = list(dfPlot)
    iCols = len(dfPlot.columns)
    # Render with a fixed [-1, 1] color scale.
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)
    cax = ax.matshow(correlations, vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(iCols)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()
def plotCorrFeature(dfData, strFeature):
    """Display the cross-surface correlation matrix for one feature column.

    Builds a frame with one column per surface class and plots its
    correlation matrix. Assumes every class has the same number of rows
    (required for the column-wise DataFrame construction) — TODO confirm.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Combined frame with a 'Surface' label column.
    strFeature : str
        Feature column to compare across surfaces, e.g. 'Ch1_rm_pk'.
    """
    lstSurfaces = ['Tile', 'Carpet', 'Cobble']
    dctColumns = {}
    for strSurface in lstSurfaces:
        dctColumns[strFeature + '_' + strSurface] = np.array(
            dfData[strFeature].loc[dfData['Surface'] == strSurface])
    dfPlot = pd.DataFrame(data=dctColumns)
    correlations = dfPlot.corr()
    names = list(dfPlot)
    iCols = len(dfPlot.columns)
    # Render with a fixed [-1, 1] color scale.
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)
    cax = ax.matshow(correlations, vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(iCols)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()
# ---------------------------------------------------------------------------
# Script: load two recordings for each of the three surface classes into one
# combined DataFrame, then explore the derived features.
# ---------------------------------------------------------------------------
strClass = 'Cobble'
strFolder = 'Cobble1'
dfData = getDataAsFrame(strFolder, strClass)
strFolder = 'Cobble2'
dfData = appendDataAsFrame(strFolder, dfData, strClass)
strClass = 'Carpet'
strFolder = 'Carpet1'
dfData = appendDataAsFrame(strFolder, dfData, strClass)
strFolder = 'Carpet2'
dfData = appendDataAsFrame(strFolder, dfData, strClass)
strClass = 'Tile'
strFolder = 'Tile1'
dfData = appendDataAsFrame(strFolder, dfData, strClass)
strFolder = 'Tile2'
dfData = appendDataAsFrame(strFolder, dfData, strClass)
# Sanity-check the features at the start and at the end of the Carpet data.
plotFeatures(dfData.loc[dfData['Surface'] == 'Carpet'].iloc[:750], strColName="Ch1")
plotFeatures(dfData.loc[dfData['Surface'] == 'Carpet'].tail(750), strColName="Ch1")
# Standardize every channel/feature column (zero mean, unit variance) before
# looking at correlations or fitting classifiers.
data_cols = [col for col in dfData.columns if 'Ch' in col]
dfMean = dfData[data_cols].mean()
dfData[data_cols] = dfData[data_cols]-dfMean
dfStd = dfData[data_cols].std()
dfData[data_cols] = dfData[data_cols]/dfStd
# Correlation between accelerometer channels, then among one channel's features.
plotCorrAccel(dfData)
strChannel = 'Ch1'
plotCorrChannel(strChannel)
# Time-base plots for one folder per class, each saved as a PDF.
plotFolder(dfData.loc[dfData['File'] == 'Carpet1'])
plt.savefig("Carpet1Timebase.pdf", format='pdf')
plotFolder(dfData.loc[dfData['File'] == 'Tile1'])
plt.savefig("Tile1Timebase.pdf", format='pdf')
plotFolder(dfData.loc[dfData['File'] == 'Cobble1'])
plt.savefig("Cobble1Timebase.pdf", format='pdf')
# Per-class histograms for the raw channels and each derived feature family.
strSuff = ''
plotClasses(dfData, strSuff)
strSuff = 'rm_'
plotClasses(dfData, strSuff)
strSuff = 'rms_'
plotClasses(dfData, strSuff)
strSuff = 'hf_'
plotClasses(dfData, strSuff)
strSuff = 'hf_pk_'
plotClasses(dfData, strSuff)
strSuff = 'pk_'
plotClasses(dfData, strSuff)
strSuff = 'rm_pk_'
plotClasses(dfData, strSuff)
strSuff = 'vl_'
plotClasses(dfData, strSuff)
# Cross-surface correlation of a single promising feature.
plotCorrFeature(dfData, 'Ch1_rm_pk')
from sklearn import svm, datasets
import sklearn.ensemble as ens
import sklearn.tree as trees
import pydotplus as pdot
from sklearn.metrics import confusion_matrix
from IPython.display import Image
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    if normalize:
        # Express each row as a fraction of its total count.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    # Annotate each cell: white text on dark cells, black on light ones.
    thresh = cm.max() / 2.
    for iRow in range(cm.shape[0]):
        for iCol in range(cm.shape[1]):
            plt.text(iCol, iRow, cm[iRow, iCol],
                     horizontalalignment="center",
                     color="white" if cm[iRow, iCol] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# ---------------------------------------------------------------------------
# Script: select the two "velocity" features and build 50/50 train/test
# splits of the combined DataFrame.
# ---------------------------------------------------------------------------
lstFeatures = ['Ch2_vl','Ch3_vl']
lstClassifiers = ['Tile','Cobble','Carpet']
# For testing of the code we want the same seed, so results are reproducible.
RANDOM_SEED = 10234
# Shuffle the row positions, then split down the middle.
idx = np.arange(dfData.shape[0])
np.random.seed(RANDOM_SEED)
np.random.shuffle(idx)
# BUG FIX: .ix was removed from pandas, and idx.size/2 yields a float on
# Python 3; use positional .iloc with integer floor division.
dfDataTrain = dfData.iloc[idx[0:(idx.size // 2)]]
dfDataTest = dfData.iloc[idx[(idx.size // 2):idx.size]]
# Break the data into features and classes
TrainFeatures = dfDataTrain[lstFeatures]
TrainClasses = dfDataTrain['Surface']
TestFeatures = dfDataTest[lstFeatures]
TestClasses = dfDataTest['Surface']
# Configure the decision tree and perform the fit
tree_depth=2
dtTrain = trees.DecisionTreeClassifier(max_depth=tree_depth)
dtTrain = dtTrain.fit(TrainFeatures, TrainClasses)
# If you haven't installed "graphviz" you will need to do that.
# I had to infer the ordering of the classifiers and manually
# input them. There must a better way.
# NOTE(review): class_names is hand-ordered [Carpet, Cobble, Tile]; verify
# it matches dtTrain.classes_ (sklearn sorts class labels internally).
dotData = trees.export_graphviz(dtTrain, out_file='None.dot',
feature_names=lstFeatures,
class_names=[lstClassifiers[2], lstClassifiers[1], lstClassifiers[0]],
filled=True, rounded=True,
special_characters=True)
dtGraph = pdot.graph_from_dot_file('None.dot')
Image(dtGraph.create_png())
# Confusion matrix on the training split.
# NOTE(review): arguments are (predicted, true); sklearn's convention is
# (y_true, y_pred), so the displayed matrix may be transposed — confirm.
predTrainClasses = dtTrain.predict(TrainFeatures)
cnm = confusion_matrix(predTrainClasses, TrainClasses, labels=lstClassifiers)
plot_confusion_matrix(cnm, lstClassifiers)
# Confusion matrix on the held-out test split.
predTestClasses = dtTrain.predict(TestFeatures)
cnm = confusion_matrix(predTestClasses, TestClasses, labels=lstClassifiers)
plot_confusion_matrix(cnm, lstClassifiers)
# ---------------------------------------------------------------------------
# Script: plot the depth-2 tree's decision regions over a mesh of the two
# selected features, with the training points scattered on top.
# ---------------------------------------------------------------------------
# Mesh step size for the feature grid.
h = 0.01
x_min, x_max = TrainFeatures[lstFeatures[0]].min() - 1, TrainFeatures[lstFeatures[0]].max() + 1
y_min, y_max = TrainFeatures[lstFeatures[1]].min() - 1, TrainFeatures[lstFeatures[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
plot_colors = "ryb"
cmapIn = plt.cm.RdYlBu
iClassifiers = len(lstClassifiers)
plt.figure(num=None, figsize=(6, 6))
# Predict a class for every mesh point, then map the string labels to
# integers so contourf can color the regions.
Z = dtTrain.predict(np.c_[xx.ravel(), yy.ravel()])
dctTemp = {lstClassifiers[0]: 0, lstClassifiers[1]: 1, lstClassifiers[2]: 2}
# BUG FIX: np.zeros_like(Z) inherits Z's non-numeric label dtype; build a
# numeric array for the contour values instead.
Zt = np.zeros(len(Z))
for idx in range(0, len(Z)):
    Zt[idx] = dctTemp[Z[idx]]
Zt = Zt.reshape(xx.shape)
cs = plt.contourf(xx, yy, Zt, cmap=cmapIn, alpha=0.4)
# BUG FIX: xrange does not exist on Python 3; use range.
for i, cIn in zip(range(iClassifiers), plot_colors):
    idx = np.where(TrainClasses == lstClassifiers[i])[0]
    plt.scatter(TrainFeatures.iloc[idx[1:3500], 0], TrainFeatures.iloc[idx[1:3500], 1],
                label=lstClassifiers[i], cmap=cmapIn,
                c=cIn, edgecolor='black', s=100)
plt.legend()
plt.xlabel(lstFeatures[0])
plt.ylabel(lstFeatures[1])
plt.savefig("Scatter.pdf", format='pdf')
# Configure the decision tree and perform the fit — a much deeper tree this
# time, to compare against the depth-2 baseline.
tree_depth=10
dtTrain3 = trees.DecisionTreeClassifier(max_depth=tree_depth)
dtTrain3 = dtTrain3.fit(TrainFeatures, TrainClasses)
# Export the deeper tree for inline display.
# NOTE(review): class_names is hand-ordered [Carpet, Cobble, Tile]; verify
# it matches dtTrain3.classes_ (sklearn sorts class labels internally).
dotData3 = trees.export_graphviz(dtTrain3, out_file='None3.dot',
feature_names=lstFeatures,
class_names=[lstClassifiers[2], lstClassifiers[1], lstClassifiers[0]],
filled=True, rounded=True,
special_characters=True)
dtGraph3 = pdot.graph_from_dot_file('None3.dot')
Image(dtGraph3.create_png())
# Confusion matrix on the training split.
# NOTE(review): arguments are (predicted, true); sklearn's convention is
# (y_true, y_pred), so the displayed matrix may be transposed — confirm.
predTrainClasses3 = dtTrain3.predict(TrainFeatures)
cnm= confusion_matrix(predTrainClasses3, TrainClasses, labels=lstClassifiers)
plot_confusion_matrix(cnm, lstClassifiers)
# Confusion matrix on the held-out test split.
predTestClasses3 = dtTrain3.predict(TestFeatures)
cnm= confusion_matrix(predTestClasses3, TestClasses, labels=lstClassifiers)
plot_confusion_matrix(cnm, lstClassifiers)